<!DOCTYPE html>
<html lang="zh-cn">
<head>
  <meta charset="utf-8">
  <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
  <title>02-网络爬虫基础 - vzvixb</title>
  <meta name="renderer" content="webkit" />
<!-- No maximum-scale here: capping the scale would stop users from pinch-zooming the page. -->
<meta name="viewport" content="width=device-width, initial-scale=1"/>

<meta http-equiv="Cache-Control" content="no-transform" />
<meta http-equiv="Cache-Control" content="no-siteapp" />

<meta name="theme-color" content="#f8f5ec" />
<meta name="msapplication-navbutton-color" content="#f8f5ec">
<meta name="apple-mobile-web-app-capable" content="yes">
<!-- This meta only recognises default, black, or black-translucent; a hex colour is not a valid value. -->
<meta name="apple-mobile-web-app-status-bar-style" content="default">


<meta name="author" content="even" /><meta name="description" content="什么是网络爬虫？ 获取url地址，发现url规律，请求网络并提取数据的自动化程序 爬虫基本流程 1. 发送请求 通过HTTP库向目标站点发起请求，即" /><meta name="keywords" content="Hugo, theme, even" />






<meta name="generator" content="Hugo 0.90.1 with theme even" />


<link rel="canonical" href="https://zhouxiaoxin.gitee.io/post/python/02.%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E5%9F%BA%E7%A1%80/" />
<link rel="apple-touch-icon" sizes="180x180" href="/apple-touch-icon.png">
<link rel="icon" type="image/png" sizes="32x32" href="/favicon-32x32.png">
<link rel="icon" type="image/png" sizes="16x16" href="/favicon-16x16.png">
<link rel="manifest" href="/manifest.json">
<link rel="mask-icon" href="/safari-pinned-tab.svg" color="#5bbad5">

<!-- Busuanzi counter script; loaded over explicit HTTPS instead of a protocol-relative URL. -->
<script async src="https://busuanzi.ibruce.info/busuanzi/2.3/busuanzi.pure.mini.js"></script>

<link href="/sass/main.min.32d4dc642fec98c34c80bebb9c784c50771712b4a8a25d9f4dd9cce3534b426e.css" rel="stylesheet">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.css" integrity="sha256-7TyXnr2YU040zfSP+rEcz29ggW4j56/ujTPwjMzyqFY=" crossorigin="anonymous">


<meta property="og:title" content="02-网络爬虫基础" />
<meta property="og:description" content="什么是网络爬虫？ 获取url地址，发现url规律，请求网络并提取数据的自动化程序 爬虫基本流程 1. 发送请求 通过HTTP库向目标站点发起请求，即" />
<meta property="og:type" content="article" />
<meta property="og:url" content="https://zhouxiaoxin.gitee.io/post/python/02.%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E5%9F%BA%E7%A1%80/" /><meta property="article:section" content="post" />
<meta property="article:published_time" content="2018-09-08T16:00:56+08:00" />
<meta property="article:modified_time" content="2018-09-08T16:00:56+08:00" />

<meta itemprop="name" content="02-网络爬虫基础">
<meta itemprop="description" content="什么是网络爬虫？ 获取url地址，发现url规律，请求网络并提取数据的自动化程序 爬虫基本流程 1. 发送请求 通过HTTP库向目标站点发起请求，即"><meta itemprop="datePublished" content="2018-09-08T16:00:56+08:00" />
<meta itemprop="dateModified" content="2018-09-08T16:00:56+08:00" />
<meta itemprop="wordCount" content="845">
<meta itemprop="keywords" content="爬虫,Python," /><meta name="twitter:card" content="summary"/>
<meta name="twitter:title" content="02-网络爬虫基础"/>
<meta name="twitter:description" content="什么是网络爬虫？ 获取url地址，发现url规律，请求网络并提取数据的自动化程序 爬虫基本流程 1. 发送请求 通过HTTP库向目标站点发起请求，即"/>

<!--[if lte IE 9]>
  <script src="https://cdnjs.cloudflare.com/ajax/libs/classlist/1.1.20170427/classList.min.js"></script>
<![endif]-->

<!--[if lt IE 9]>
  <script src="https://cdn.jsdelivr.net/npm/html5shiv@3.7.3/dist/html5shiv.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/respond.js@1.4.2/dest/respond.min.js"></script>
<![endif]-->

</head>
<body>
  <div id="mobile-navbar" class="mobile-navbar">
  <div class="mobile-header-logo">
    <a href="/" class="logo">Even</a>
  </div>
  <div class="mobile-navbar-icon">
    <span></span>
    <span></span>
    <span></span>
  </div>
</div>
<!--
  Mobile menu (see the mobile-menu / slideout-menu classes).
  Each link lives inside its <li>: a <ul> may only have <li> children, so
  wrapping the <li> in an <a> is invalid markup.
  NOTE(review): the clickable area is now the link text rather than the whole
  row unless the stylesheet makes the link a block; confirm against the theme CSS.
-->
<nav id="mobile-menu" class="mobile-menu slideout-menu">
  <ul class="mobile-menu-list">
    <li class="mobile-menu-item"><a href="/">Home</a></li>
    <li class="mobile-menu-item"><a href="/post/">Archs</a></li>
    <li class="mobile-menu-item"><a href="/tags/">Tags</a></li>
    <li class="mobile-menu-item"><a href="/categories/">Cates</a></li>
    <li class="mobile-menu-item"><a href="/about/">About</a></li>
    <li class="mobile-menu-item"><a href="/pages/runoob/">runoob</a></li>
    <li class="mobile-menu-item"><a href="/pages/98wubi/">98wubi</a></li>
  </ul>
</nav>
  <div class="container" id="mobile-panel">
    <header id="header" class="header">
        <div class="logo-wrapper">
  <a href="/" class="logo">Even</a>
</div>

<nav class="site-navbar">
  <ul id="menu" class="menu">
    <li class="menu-item">
        <a class="menu-item-link" href="/">Home</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/post/">Archs</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/tags/">Tags</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/categories/">Cates</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/about/">About</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/pages/runoob/">runoob</a>
      </li><li class="menu-item">
        <a class="menu-item-link" href="/pages/98wubi/">98wubi</a>
      </li>
  </ul>
</nav>
    </header>

    <main id="main" class="main">
      <div class="content-wrapper">
        <div id="content" class="content">
          <article class="post">
    
    <header class="post-header">
      <h1 class="post-title">02-网络爬虫基础</h1>

      <div class="post-meta">
        <span class="post-time"> 2018-09-08 </span>
        <div class="post-category">
            <a href="/categories/python/"> Python </a>
            </div>
          <span class="more-meta"> 约 845 字 </span>
          <span class="more-meta"> 预计阅读 2 分钟 </span>
        <!-- The spinner image carries no information of its own, so it gets an empty alt instead of its file name. -->
        <span id="busuanzi_container_page_pv" class="more-meta"> <span id="busuanzi_value_page_pv"><img src="/img/spinner.svg" alt=""/></span> 次阅读 </span>
      </div>
    </header>

    <div class="post-toc" id="post-toc">
  <h2 class="post-toc-title">文章目录</h2>
  <div class="post-toc-content">
    <nav id="TableOfContents">
  <ul>
    <li><a href="#1-发送请求">1. 发送请求</a></li>
    <li><a href="#2-获取响应内容">2. 获取响应内容</a></li>
    <li><a href="#3-解析内容">3. 解析内容</a></li>
    <li><a href="#4-保存数据">4. 保存数据</a></li>
  </ul>

  <ul>
    <li><a href="#request中包含什么">Request中包含什么?</a></li>
    <li><a href="#response中包含什么">Response中包含什么</a></li>
  </ul>

  <ul>
    <li><a href="#怎样解决javascritp渲染的问题">怎样解决JavaScript渲染的问题？</a></li>
  </ul>
</nav>
  </div>
</div>
    <div class="post-content">
      <h1 id="什么是网络爬虫">什么是网络爬虫？</h1>
<p><strong>获取url地址，发现url规律，请求网络并提取数据的自动化程序</strong></p>
<h1 id="爬虫基本流程">爬虫基本流程</h1>
<h2 id="1-发送请求">1. 发送请求</h2>
<p><img src="./img/01-%E7%88%AC%E8%99%AB%E5%9F%BA%E6%9C%AC%E6%B5%81%E7%A8%8B.png" alt="爬虫基本流程"></p>
<p>通过HTTP库向目标站点发起请求，即发送一个Request,请求可以包含额外的headers等信息，等待服务器响应。</p>
<h2 id="2-获取响应内容">2. 获取响应内容</h2>
<p>如果服务器能正常响应，会得到一个Response,Response的内容便是所要获取的页面内容，类型可能有HTML，JSON字符串，二进制数据(如：图片视频等) 等类型。</p>
<h2 id="3-解析内容">3. 解析内容</h2>
<p>得到的内容可能是<strong>HTML</strong>，可以用正则表达式、网页解析库(<code>xpath,pyquery,beautifulsoup</code>等)进行解析，可能是<strong>Json</strong>，可以直接转换json对象解析，可能是<strong>二进制数据</strong>，可以做保存或者进一步的处理。</p>
<h2 id="4-保存数据">4. 保存数据</h2>
<p>保存形式多样，可以存为文本(txt,json),也可以保存至数据库,或者保存特定格式的文件(csv,excel,word)。</p>
<h1 id="什么是request和response">什么是Request和Response</h1>
<p><img src="./img/02-Request%E4%B8%8EResponse.png" alt="Request和Response"></p>
<ul>
<li>Request 浏览器发送消息给该网址所在的服务器,这个过程就叫做HTTP Request。</li>
<li>Response 服务器收到浏览器发送的消息后，能够根据浏览器发送消息的内容，做相应处理，然后把消息回传给浏览器。这个过程叫做HTTP Response。</li>
<li>浏览器收到服务器的Response信息后,会对信息进行相应的处理，然后展示。</li>
</ul>
<h2 id="request中包含什么">Request中包含什么?</h2>
<ul>
<li>请求方式</li>
<li>请求URL</li>
<li>请求头</li>
<li>请求体
<img src="./img/03-Request%E8%AF%B7%E6%B1%82.png" alt="Request中包含什么"></li>
</ul>
<h2 id="response中包含什么">Response中包含什么</h2>
<ul>
<li>响应状态</li>
<li>响应头</li>
<li>响应数据
<img src="./img/04-Response%E5%93%8D%E5%BA%94.png" alt="Response中包含什么"></li>
</ul>
<h1 id="能抓怎样的数据">能抓怎样的数据？</h1>
<ol>
<li>网页文本</li>
<li>图片(二进制)</li>
<li>视频(二进制)</li>
<li>其它</li>
</ol>
<h1 id="怎样来解析">怎样来解析？</h1>
<ul>
<li>json解析</li>
<li>正则表达式</li>
<li>xpath</li>
<li>PyQuery</li>
<li>BeautifulSoup</li>
</ul>
<h1 id="为什么我抓到的和浏览器看到的不一样">为什么我抓到的和浏览器看到的不一样？</h1>
<blockquote>
<p>由于网页ajax请求，可以通过js来改变html的结构和状态，所以导致我们请求的html和浏览器请求的html不一致</p>
</blockquote>
<h2 id="怎样解决javascritp渲染的问题">怎样解决JavaScript渲染的问题？</h2>
<ol>
<li>分析Ajax请求</li>
<li>Selenium/WebDriver(自动化测试的工具)</li>
<li>Splash</li>
<li>PyV8、Ghost.py</li>
</ol>
<h1 id="怎样来保存数据">怎样来保存数据</h1>
<ul>
<li>文本(txt,json,xml)</li>
<li>关系数据库(mysql)</li>
<li>非关系数据库(mongodb)</li>
<li>二进制附件(图片，音视频)</li>
</ul>

    </div>

    <div class="post-copyright">
  <p class="copyright-item">
    <span class="item-title">文章作者</span>
    <span class="item-content">even</span>
  </p>
  <p class="copyright-item">
    <span class="item-title">上次更新</span>
    <span class="item-content">
        2018-09-08
        
    </span>
  </p>
  
  
</div>
<footer class="post-footer">
      <div class="post-tags">
          <a href="/tags/%E7%88%AC%E8%99%AB/">爬虫</a>
          <a href="/tags/python/">Python</a>
          </div>
      <nav class="post-nav">
        <a class="prev" href="/post/1/%E5%89%8D%E7%AB%AF%E7%9F%A5%E8%AF%86%E7%9B%AE%E5%BD%95/">
            <i class="iconfont icon-left"></i>
            <span class="prev-text nav-default">前端知识目录</span>
            <span class="prev-text nav-mobile">上一篇</span>
          </a>
        <a class="next" href="/post/python/03.urllib%E5%BA%93%E8%AF%A6%E8%A7%A3/">
            <span class="next-text nav-default">03-urllib库详解</span>
            <span class="next-text nav-mobile">下一篇</span>
            <i class="iconfont icon-right"></i>
          </a>
      </nav>
    </footer>
  </article>
        </div>
        

  

  

      </div>
    </main>

    <footer id="footer" class="footer">
      <!--
        Footer row of icon-only social links; each link's name comes from its title attribute.
        NOTE(review): every entry except RSS points at http://localhost:1313 and the
        email address is your@email.com. These look like unconfigured placeholder
        values; confirm, then replace them with real profile URLs or remove them.
      -->
      <div class="social-links">
      <a href="mailto:your@email.com" class="iconfont icon-email" title="email"></a>
      <a href="http://localhost:1313" class="iconfont icon-stack-overflow" title="stack-overflow"></a>
      <a href="http://localhost:1313" class="iconfont icon-twitter" title="twitter"></a>
      <a href="http://localhost:1313" class="iconfont icon-facebook" title="facebook"></a>
      <a href="http://localhost:1313" class="iconfont icon-linkedin" title="linkedin"></a>
      <a href="http://localhost:1313" class="iconfont icon-google" title="google"></a>
      <a href="http://localhost:1313" class="iconfont icon-github" title="github"></a>
      <a href="http://localhost:1313" class="iconfont icon-weibo" title="weibo"></a>
      <a href="http://localhost:1313" class="iconfont icon-zhihu" title="zhihu"></a>
      <a href="http://localhost:1313" class="iconfont icon-douban" title="douban"></a>
      <a href="http://localhost:1313" class="iconfont icon-pocket" title="pocket"></a>
      <a href="http://localhost:1313" class="iconfont icon-tumblr" title="tumblr"></a>
      <a href="http://localhost:1313" class="iconfont icon-instagram" title="instagram"></a>
      <a href="http://localhost:1313" class="iconfont icon-gitlab" title="gitlab"></a>
      <a href="http://localhost:1313" class="iconfont icon-bilibili" title="bilibili"></a>
  <a href="https://zhouxiaoxin.gitee.io/index.xml" type="application/rss+xml" class="iconfont icon-rss" title="rss"></a>
</div>

<div class="copyright">
  <span class="power-by">
    由 <a class="hexo-link" href="https://gohugo.io">Hugo</a> 强力驱动
  </span>
  <span class="division">|</span>
  <span class="theme-info">
    主题 - 
    <a class="theme-link" href="https://github.com/olOwOlo/hugo-theme-even">Even</a>
  </span>

  <div class="busuanzi-footer">
    <span id="busuanzi_container_site_pv"> 本站总访问量 <span id="busuanzi_value_site_pv"><img src="/img/spinner.svg" alt="spinner.svg"/></span> 次 </span>
      <span class="division">|</span>
    <span id="busuanzi_container_site_uv"> 本站总访客数 <span id="busuanzi_value_site_uv"><img src="/img/spinner.svg" alt="spinner.svg"/></span> 人 </span>
  </div>

  <span class="copyright-year">
    &copy; 
    2018 - 
    2022
    <span class="heart">
      <i class="iconfont icon-heart"></i>
    </span>
    <span class="author">even</span>
  </span>
</div>
    </footer>

    <div class="back-to-top" id="back-to-top">
      <i class="iconfont icon-up"></i>
    </div>
  </div>
  
  <script src="https://cdn.jsdelivr.net/npm/jquery@3.2.1/dist/jquery.min.js" integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/slideout@1.0.1/dist/slideout.min.js" integrity="sha256-t+zJ/g8/KXIJMjSVQdnibt4dlaDxc9zXr/9oNPeWqdg=" crossorigin="anonymous"></script>
  <script src="https://cdn.jsdelivr.net/npm/@fancyapps/fancybox@3.1.20/dist/jquery.fancybox.min.js" integrity="sha256-XVLffZaxoWfGUEbdzuLi7pwaUJv1cecsQJQqGLe7axY=" crossorigin="anonymous"></script>



<script type="text/javascript" src="/js/main.min.2517c0eb67172a0bae917de4af59b10ca2531411a009d4c0b82f5685259e5771.js"></script>








</body>
</html>
